import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             precision_score, recall_score, f1_score,
                             confusion_matrix, ConfusionMatrixDisplay)
# Note: plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay is its replacement.
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
def xgb_model_init(X_train, y_train):
    # Baseline XGBoost classifier with default hyperparameters
    xgb_clf = XGBClassifier(objective='binary:logistic', eval_metric='auc')
    xgb_clf.fit(X_train, y_train)
    return xgb_clf
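# Note: xgb_model_init is never called in this excerpt; if a baseline were
# wanted for comparison with the tuned model, usage would be:
#     xgb_baseline = xgb_model_init(x_train, y_train)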
def xgb_model_tuned(X_train, y_train):
    # Hyperparameter grid: 3 depths x 1 child weight x 3 learning rates
    # x 2 ensemble sizes = 18 candidates
    param_grid = {
        'max_depth': [5, 8, 10],
        'min_child_weight': [4],
        'learning_rate': [0.1, 0.003, 0.001],
        'n_estimators': [50, 100]}
    clf = XGBClassifier(objective='binary:logistic', eval_metric='auc')
    grid = GridSearchCV(estimator=clf, param_grid=param_grid,
                        cv=4, verbose=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    return grid.best_estimator_
data = pd.read_csv('preprocessing1.csv', encoding='cp949', index_col=0)
seed = 5764
target = data['Status']
data.drop(['Status'], axis=1, inplace=True)
# Split into train and test sets (0.7 : 0.3)
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=seed)
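# Sketch (not in the original notebook): cross_val_score is imported above but
# otherwise unused; a quick 4-fold AUC sanity check of a default model on the
# training split:
cv_auc = cross_val_score(XGBClassifier(objective='binary:logistic', eval_metric='auc'),
                         x_train, y_train, cv=4, scoring='roc_auc')
print('CV AUC: %.4f +/- %.4f' % (cv_auc.mean(), cv_auc.std()))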
xgb_model = xgb_model_tuned(x_train,y_train)
# GridSearchCV (verbose) output: Fitting 4 folds for each of 18 candidates, totalling 72 fits
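# Sketch (not in the original notebook): the metric imports above are otherwise
# unused; scoring the tuned model on the held-out split:
y_pred = xgb_model.predict(x_test)
print('accuracy          :', accuracy_score(y_test, y_pred))
print('balanced accuracy :', balanced_accuracy_score(y_test, y_pred))
print('precision         :', precision_score(y_test, y_pred))
print('recall            :', recall_score(y_test, y_pred))
print('f1 score          :', f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))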
df_x_test = pd.DataFrame(x_test)
y_hat = xgb_model.predict(x_test)
list1 = []       # correctly predicted positives (y_hat == y == 1)
list0 = []       # correctly predicted negatives (y_hat == y == 0)
list_error = []  # misclassified instances
for i in range(len(x_test)):
    a = df_x_test.index[i]  # original index label of the i-th test row
    if y_hat[i] == target[a] == 1:
        list1.append(i)
    elif y_hat[i] == target[a] == 0:
        list0.append(i)
    else:
        list_error.append(i)
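# Sketch (not in the original notebook): sizes of the three groups
print(len(list1), len(list0), len(list_error))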
import numpy as np
from lime import lime_tabular

# Build the LIME explainer on the training data; it must exist before the
# loops below can use it.
xgb_lime_explainer = lime_tabular.LimeTabularExplainer(
    np.array(x_train), feature_names=list(data.columns), mode="classification")

# LIME explanations for every correctly predicted positive ...
for i in range(len(list1)):
    print(list1[i])
    xgb_lime_explanation = xgb_lime_explainer.explain_instance(x_test.iloc[list1[i]], xgb_model.predict_proba)
    xgb_lime_explanation.show_in_notebook(show_table=True)

# ... and every correctly predicted negative.
for i in range(len(list0)):
    print(list0[i])
    xgb_lime_explanation = xgb_lime_explainer.explain_instance(x_test.iloc[list0[i]], xgb_model.predict_proba)
    xgb_lime_explanation.show_in_notebook(show_table=True)
# Individual LIME explanations for a few hand-picked test rows
for idx in [1, 900, 910, 4, 14]:
    xgb_lime_explanation = xgb_lime_explainer.explain_instance(x_test.iloc[idx], xgb_model.predict_proba)
    xgb_lime_explanation.show_in_notebook(show_table=True)
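# Aside (not in the original notebook): outside Jupyter, a LIME explanation can
# be written to standalone HTML:
# xgb_lime_explanation.save_to_file('xgb_lime_instance.html')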
import shap
xgb_shap_explainer = shap.TreeExplainer(xgb_model)
shap_values = xgb_shap_explainer.shap_values(x_test)
# (xgboost deprecation warning emitted here:
#  "ntree_limit is deprecated, use `iteration_range` or model slicing instead.")
shap.initjs()  # load the SHAP JS runtime (once per notebook is enough)
# Force plots for the same hand-picked rows explained with LIME above;
# in the notebook each call was presumably its own cell.
shap.force_plot(xgb_shap_explainer.expected_value, shap_values[900, :], x_test.iloc[900, :])
shap.force_plot(xgb_shap_explainer.expected_value, shap_values[910, :], x_test.iloc[910, :])
shap.force_plot(xgb_shap_explainer.expected_value, shap_values[4, :], x_test.iloc[4, :])
shap.force_plot(xgb_shap_explainer.expected_value, shap_values[14, :], x_test.iloc[14, :])
shap.summary_plot(shap_values, x_test.astype("float"), plot_size=1.0)
shap.summary_plot(shap_values, x_test, plot_type='bar')
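# Aside (not in the original notebook): a dependence plot for the top-ranked
# feature; "rank(0)" selects the feature with the largest mean |SHAP| value.
shap.dependence_plot("rank(0)", shap_values, x_test)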